################################################################################
################################################################################
################################################################################
###########Create Control Variable for Debate Topics############################
library(quanteda)
library(keyATM)

################################################################################
#### Depending on where the speech was recorded (UCD / Linked Data), the
#### `debate` variable is either the title of the debate or an alphanumeric code.

### Prepare Corpus to run keyATM ####
# Load the speech-level data. The file is expected to provide the data frame
# `EP_complete_dict`, which is used directly below.
load("Data/EP_complete_dict_110323.Rdata")

# Build a quanteda corpus from the speeches: texts come from the `text`
# column and document ids from `doc_id`. Duplicated ids are accepted
# (unique_docnames = FALSE).
corp_debates <- corpus(
  EP_complete_dict,
  docid_field = "doc_id",
  text_field = "text",
  unique_docnames = FALSE
)
ndoc(corp_debates)

# Tokenise, remove noise, and pool all speeches of one debate into a single
# document (one document per value of the `debate` docvar).
#
# `tokens()` has no `tolower` argument (it was silently ignored with a
# warning), so it is not passed here. Nothing is lost: `tokens_remove()`
# matches case-insensitively by default and `dfm()` lowercases the features
# when the document-feature matrix is built.
#
# Besides English stopwords, a few literal noise tokens are removed
# (presumably residue of escaped code points such as <U+00A0> -- confirm),
# and every token shorter than three characters is dropped.
toks_debates <- tokens(corp_debates, remove_punct = TRUE) %>%
  tokens_remove(pattern = c(stopwords("english"), "+", "<", "u", ">", "00a0", "00e1", "00e9", "eur")) %>%
  tokens_select(min_nchar = 3) %>%
  tokens_group(groups = debate)

# Debate-level document-feature matrix. Features are lowercased and then
# trimmed by total count: only terms occurring at least 100 and at most
# 100000 times across all debates are kept.
dfmat_debates <- dfm_trim(
  dfm(toks_debates, tolower = TRUE),
  min_termfreq = 100,
  max_termfreq = 100000,
  termfreq_type = "count"
)

# Interactive sanity checks: most frequent features, vocabulary size and
# number of debate documents (before empty documents are dropped).
topfeatures(dfmat_debates, 100)
nfeat(dfmat_debates)
ndoc(dfmat_debates)

# Keep only debates that still contain at least one token after trimming.
dfmat_debates <- dfm_subset(dfmat_debates, ntoken(dfmat_debates) > 0)

### IMPLEMENT KEY ATM BASED ON CAP KEYWORDS #######################
#### If fitting the model takes too long during replication, uncomment the line below to load the saved output instead ####
#load("NLP_Model/out50_05112021.Rdata")


# Convert the debate-level dfm into the document object consumed by keyATM.
keyATM_docs <- keyATM_read(texts = dfmat_debates)
# Seed keywords for the keyword-assisted topic model: a named list with one
# character vector per topic (21 topics). List order matters because the
# topic columns are renamed by position further down in this script.
#
# NOTE(review): "unions" appears twice in Labour and "science" twice in
# Technology. The duplicates are kept exactly as in the original
# specification; confirm whether a different word was intended.
# "nuclear" (Energy, Defense) and "foreign" (Trade, International) are shared
# between topics.
keywords <- list(
  Macro = c(
    "inflation", "cost", "price", "rate", "unemployment", "money",
    "monetary", "debt", "budget", "tax", "industry"
  ),
  Civil = c(
    "civil", "civic", "minority", "discrimination", "freedom", "privacy"
  ),
  Health = c(
    "health", "healthcare", "insurance", "pharmaceutical", "care",
    "prescription", "treatment", "research"
  ),
  Agriculture = c(
    "agriculture", "farmers", "farms", "farming", "food", "seafood", "fish",
    "animal", "crop", "fisheries", "fishing"
  ),
  Labour = c(
    "labour", "workers", "training", "benefits", "unions", "unions",
    "employment", "seasonal"
  ),
  Education = c(
    "education", "elementary", "secondary", "schools", "educational",
    "student", "underprivileged", "vocational"
  ),
  Environment = c(
    "environment", "environmental", "water", "waste", "pollution", "air",
    "climate", "recycling", "forest", "species", "conservation"
  ),
  Energy = c(
    "energy", "nuclear", "electricity", "electrical", "gas", "oil", "coal",
    "renewable", "efficiency"
  ),
  Immigration = c(
    "immigration", "refugees", "citizenship", "migration", "asylum"
  ),
  Transport = c(
    "transport", "highway", "aviation", "airports", "rail", "travel",
    "freight", "infrastructure"
  ),
  Law = c(
    "law", "crime", "enforcement", "borders", "illegal", "court", "prison",
    "abuse", "family", "violence", "police", "security", "terrorism"
  ),
  Welfare = c(
    "welfare", "social", "elderly", "assistance", "charities", "youth"
  ),
  Housing = c(
    "housing", "houses", "urban", "community", "neighbourhood", "rural",
    "homeless", "rent"
  ),
  Commerce = c(
    "commerce", "banking", "financial", "trading", "commodities",
    "investments", "finance", "regulation", "bankruptcy", "corporate",
    "business", "tourism", "consumer"
  ),
  Defense = c(
    "defense", "defence", "alliance", "nato", "peacekeeping", "intelligence",
    "nuclear", "arms", "espionage", "military", "forces"
  ),
  Technology = c(
    "science", "technology", "science", "broadcast", "forecasting",
    "computer"
  ),
  Trade = c(
    "trade", "foreign", "exports", "export", "negotiations", "import",
    "imports", "exchange"
  ),
  International = c(
    "international", "aid", "foreign", "affairs", "developing", "world",
    "ngo", "unesco", "diplomacy", "diplomats"
  ),
  Government = c(
    "government", "local", "intergovernmental", "appointment",
    "administration", "procurement", "capital", "holidays"
  ),
  Lands = c("lands", "territorial", "indigenous", "public"),
  Culture = c("culture", "cultural", "sports", "theatre", "arts")
)

# Inspect how the chosen keywords are distributed in the corpus before fitting.
key_viz <- visualize_keywords(docs = keyATM_docs, keywords = keywords)
key_viz

# Fit the base keyATM model: one topic per keyword set in `keywords` plus
# 5 additional topics without keywords. The seed is fixed so that the run
# can be reproduced.
out50 <- keyATM(docs              = keyATM_docs,    # text input
                no_keyword_topics = 5,              # number of topics without keywords
                keywords          = keywords,       # keywords
                model             = "base",         # select the model
                options           = list(seed = 250,
                                         iterations = 1500))

# Model diagnostics. `plot_pi()` needs the fitted model, so it has to run
# after `out50` exists; calling it before the fit stops a fresh
# top-to-bottom run with "object 'out50' not found".
plot_pi(out50)

# Top 20 words per topic.
top_words(out50, 20)


#MERGING TOPICMODEL WITH CORPUS


# Final variable names for the topic proportions, in the column order of
# `out50$theta`: the 21 keyword topics (in the order of the keyword list)
# followed by the 5 no-keyword topics.
topic_cols <- c("de_macro", "de_civil", "de_health", "de_agri", "de_labour",
                "de_edu", "de_envi", "de_energy", "de_immi", "de_transport",
                "de_law", "de_welfare", "de_housing", "de_commerce",
                "de_defense", "de_techno", "de_trade", "de_intern",
                "de_govern", "de_lands", "de_culture",
                "proced_1", "proced_2", "proced_3", "proced_4", "proced_5")

# Debate-level topic proportions (one row per debate document).
debatetopdis <- data.frame(out50$theta)

# Make sure the dfm is restricted to non-empty documents; this is a no-op if
# it has already been filtered before the model was fitted.
dfmat_debates <- dfm_subset(dfmat_debates, ntoken(dfmat_debates) > 0)

# Fail early if the model output does not line up with the naming vector or
# with the documents whose debate ids are attached below.
stopifnot(
  ncol(debatetopdis) == length(topic_cols),
  nrow(debatetopdis) == ndoc(dfmat_debates)
)

# Attach the debate identifier through the public accessor (instead of the
# internal `@docvars` slot) and name the column directly rather than
# renaming it by a hard-coded position afterwards.
preds_de <- cbind(debatetopdis, debate = docvars(dfmat_debates, "debate"))

# Add the debate-level topic proportions to every speech of that debate.
# This is an inner join: speeches whose debate has no row in `preds_de` are
# not part of the result.
EP_debates <- merge(EP_complete_dict, preds_de, by = "debate")

# merge() places the non-key columns of `preds_de` after all columns of
# `EP_complete_dict`, so the topic proportions are the last
# length(topic_cols) columns of the result.
n_all <- ncol(EP_debates)
topic_idx <- (n_all - length(topic_cols) + 1L):n_all
colnames(EP_debates)[topic_idx] <- topic_cols

# NOTE(review): column 77 is dropped by position; which variable this is
# cannot be seen from this script -- confirm and preferably drop it by name.
EP_debates[,77] <- NULL

save(EP_debates, file ="Data/EP_debates_11032023.Rdata")


##### CHECKED AND RUN ON 11.03.2023
